import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
import math
sns.set()

 filepath = r'C:\Users\azuka\Drive Folder\ML projects\Leapsapp\browsing or purchasing\Rename.csv'
data = pd.read_csv(filepath, sep=';')
# data

 def check_values(df):
    col_desc = []
    data = {
        'features': [col for col in df.columns],
        'data_type': [df[col].dtype for col in df.columns],
        'nan_total': [df[col].isna().sum() for col in df.columns],
        'nan_pct': [round(df[col].isna().sum()/len(df)*100,2) for col in df.columns],
        'unique': [df[col].nunique() for col in df.columns],
        'values_ex': [df[col].drop_duplicates().sample(df[col].nunique()).values if df[col].nunique() <= 5 else df[col].drop_duplicates().sample(2).values for col in df.columns]
    }
    return pd.DataFrame(data)

 check_values(data)

 num_cols = [x for x in data.iloc[:,:10].columns]
print('num_cols:', num_cols)
cat_cols = [x for x in data.iloc[:,10:-1].columns]
print('cat_cols:', cat_cols)

num_cols: ['Administrative', 'AdministrativeDuration', 'Informational', 'InformationalDuration', 'ProductRelated', 'ProductRelatedDuration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']
cat_cols: ['OperatingSystems', 'Browser', 'Region', 'TrafficType', 'Weekend', 'Month', 'VisitorType']

 def plot_kdeplot(features):
    feat_len = len(features)
    if feat_len > 3:
        nrows = math.ceil(feat_len/4)
        ncols = math.ceil(feat_len/3)
    else:
        nrows=3
        ncols=4
    
    fig, ax = plt.subplots(nrows, ncols, figsize=(ncols*4, nrows*3))
    for i, col in enumerate(features):
        if feat_len > 3:
            plt.subplot(nrows,ncols,i+1)
            g=sns.kdeplot(data[data.HasPurchased==1][col], label='Purchase')
            g=sns.kdeplot(data[data.HasPurchased==0][col], label='No Puchase')
            g.legend()
        else:
            sns.kdeplot(data[data.HasPurchased==1][col], label='Purchase', ax=ax[i])
            sns.kdeplot(data[data.HasPurchased==0][col], label='No Puchase', ax=ax[i])
            ax[i].legend()
    for j in range(nrows*ncols-feat_len):
        if feat_len > 3:
            ax[-1,-(j+1)].axis('off')
        else: ax[-(j+1)].axis('off')
    plt.tight_layout()

 %%time
plot_kdeplot(num_cols)
plt.show()

Wall time: 56.2 s

 data.HasPurchased.replace({0:'No Purchase', 1:'Purchase'}).value_counts(ascending=True).plot.barh()
plt.show()

 from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_validate, GridSearchCV, ShuffleSplit, learning_curve, StratifiedShuffleSplit
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, plot_roc_curve, plot_precision_recall_curve, \
precision_recall_curve, plot_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay, precision_score, f1_score, accuracy_score
from imblearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, EditedNearestNeighbours
from imblearn.combine import SMOTETomek, SMOTEENN

from sklearn.preprocessing import MinMaxScaler, RobustScaler, OneHotEncoder, OrdinalEncoder
from xgboost.sklearn import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import RFE
import category_encoders as ce
from sklearn.compose import ColumnTransformer
import pickle

 df_train = data.copy()

cat_cols = [x for x in cat_cols if x != 'Weekend']

for col in cat_cols:
    df_train[col] = df_train[col].astype('object')

df_train = pd.get_dummies(df_train, drop_first=True)

X = df_train.drop(columns=['HasPurchased'])
y = df_train['HasPurchased']

X_train_val, X_test, y_train_val, y_test = train_test_split(X,y, random_state=26, stratify=y)

 print(classification_report(y_test, np.zeros(len(y_test))))

              precision    recall  f1-score   support

         0.0       0.85      1.00      0.92      2606
         1.0       0.00      0.00      0.00       477

    accuracy                           0.85      3083
   macro avg       0.42      0.50      0.46      3083
weighted avg       0.71      0.85      0.77      3083

C:\Users\azuka\anaconda3-1\envs\py38\lib\site-packages\sklearn\metrics\_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\azuka\anaconda3-1\envs\py38\lib\site-packages\sklearn\metrics\_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\azuka\anaconda3-1\envs\py38\lib\site-packages\sklearn\metrics\_classification.py:1245: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

 lr_baseline = LogisticRegression(max_iter=500)
lr_baseline.fit(X_train_val, y_train_val)
print(classification_report(y_test, lr_baseline.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.89      0.97      0.93      2606
         1.0       0.70      0.37      0.48       477

    accuracy                           0.88      3083
   macro avg       0.79      0.67      0.71      3083
weighted avg       0.86      0.88      0.86      3083

C:\Users\azuka\anaconda3-1\envs\py38\lib\site-packages\sklearn\linear_model\_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

 dtc_baseline = DecisionTreeClassifier(max_depth=1, random_state=26)
dtc_baseline.fit(X_train_val, y_train_val)
print(classification_report(y_test, dtc_baseline.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.96      0.89      0.93      2606
         1.0       0.58      0.79      0.67       477

    accuracy                           0.88      3083
   macro avg       0.77      0.84      0.80      3083
weighted avg       0.90      0.88      0.89      3083

 def pipe_model(models):
    pipelines = []
    for model in models:
        pl = Pipeline([
        # ('transfr', transformer),
        # ('scaler', MinMaxScaler()),
#         ('rfe', RFE(model[1])),
#         ('sampling', RandomOverSampler(sampling_strategy=0.1)),
#         ('sampling2', RandomUnderSampler(sampling_strategy=0.5)),
        # ('sampling', SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority'))),
        ('model', model[1])
])
        pipelines.append(pl)
    return pipelines

 def pipe_scale_model(models):
    pipelines = []
    for model in models:
        pl = Pipeline([
        # ('transfr', transformer),
        ('scaler', RobustScaler()),
#         ('rfe', RFE(model[1])),
#         ('sampling', RandomOverSampler(sampling_strategy=0.1)),
#         ('sampling2', RandomUnderSampler(sampling_strategy=0.5)),
        # ('sampling', SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority'))),
        ('model', model[1])
])
        pipelines.append(pl)
    return pipelines

 def pipe_smote_model(models):
    pipelines = []
    for model in models:
        pl = Pipeline([
        # ('transfr', transformer),
        # ('scaler', RobustScaler()),
#         ('rfe', RFE(model[1])),
        ('sampling', SMOTE()),
#         ('sampling2', RandomUnderSampler(sampling_strategy=0.5)),
        # ('sampling', SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority'))),
        ('model', model[1])
])
        pipelines.append(pl)
    return pipelines

 def pipe_scale_smote_model(models):
    pipelines = []
    for model in models:
        pl = Pipeline([
        # ('transfr', transformer),
        ('scaler', MinMaxS()),
#         ('rfe', RFE(model[1])),
        ('sampling', SMOTE()),
#         ('sampling2', RandomUnderSampler(sampling_strategy=0.5)),
        # ('sampling', SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority'))),
        ('model', model[1])
])
        pipelines.append(pl)
    return pipelines

 def model_evaluation(pipes, metric):
    skfold = StratifiedKFold(n_splits=5, random_state=26, shuffle=True)
    model_cvs = []
    for pipe in pipes:
        model_cv = cross_validate(pipe, X_train_val, y_train_val, cv=skfold, scoring=metric, return_train_score=True, n_jobs=4)
        model_cvs.append(model_cv)
    return model_cvs

def results_dataframe(models, model_cvs, metric):
    scores = np.zeros((len(models),len(metric)*2))
    j = 0
    for model, model_cv in zip(models, model_cvs):
        
        model_score = []
        for i in model_cv:
            
            if i == 'fit_time' or i == 'score_time':
                continue
            model_score.append(model_cv[i].mean())
        scores[j] = model_score
        j+=1
    return pd.DataFrame(scores, index=[i[0] for i in models], columns = [i for i in list(model_cv_1[0].keys()) if i not in ('fit_time', 'score_time')])

 models_1 = [('logreg', LogisticRegression()),('tree', DecisionTreeClassifier(random_state=26)), ('sgdc', SGDClassifier(random_state=26)), ('nb', GaussianNB())]
metric = ('f1', 'accuracy', 'recall', 'precision')
pl_1 = pipe_model(models_1)
model_cv_1 = model_evaluation(pl_1, metric)

df_model = results_dataframe(models_1, model_cv_1, metric).T
df_model

 models_2 = [('logreg', LogisticRegression()),('tree', DecisionTreeClassifier(random_state=26)), ('sgdc', SGDClassifier(random_state=26)), ('nb', GaussianNB())]
metric = ('f1', 'accuracy', 'recall', 'precision')
pl_2 = pipe_scale_model(models_2)
model_cv_2 = model_evaluation(pl_2, metric)

df_model2 = results_dataframe(models_2, model_cv_2, metric).T
df_model2

 models_4 = [('logreg', LogisticRegression()),('tree', DecisionTreeClassifier(random_state=26)), ('sgdc', SGDClassifier(random_state=26)), ('nb', GaussianNB())]
metric = ('f1', 'accuracy', 'recall', 'precision')
pl_4 = pipe_smote_model(models_4)
model_cv_4 = model_evaluation(pl_4, metric)

df_model4 = results_dataframe(models_4, model_cv_4, metric).T
df_model4

 models_3 = [('logreg', LogisticRegression()),('tree', DecisionTreeClassifier(random_state=26)), ('sgdc', SGDClassifier(random_state=26)), ('nb', GaussianNB())]
metric = ('f1', 'accuracy', 'recall', 'precision')
pl_3 = pipe_scale_smote_model(models_3)
model_cv_3 = model_evaluation(pl_3, metric)

df_model3 = results_dataframe(models_3, model_cv_3, metric).T
df_model3

 def summary_df(name):
    return pd.concat([df_model[[name]].rename(columns={name:'model'}),
           df_model2[[name]].rename(columns={name:'Scaling+model'}),
           df_model4[[name]].rename(columns={name:'SMOTE+model'}),
           df_model3[[name]].rename(columns={name:'Scaling+SMOTE+model'})], axis=1).round(3)

 pd.concat([summary_df('logreg').loc['test_f1'].rename('logreg'),\
    summary_df('tree').loc['test_f1'].rename('tree'),
    summary_df('sgdc').loc['test_f1'].rename('sgdc'),
    summary_df('nb').loc['test_f1'].rename('nb')], axis=1)

 summary_df('tree')

 summary_df('logreg')

 # pl_3[0].set_params(model__solver='saga', model__max_iter=500)

 pl_4[1]

Pipeline(steps=[('sampling', SMOTE()),
                ('model', DecisionTreeClassifier(random_state=26))])

 dtc_benchmark = pl_4[1]
dtc_benchmark.fit(X_train_val,y_train_val)
y_pred_dtc_benchmark = dtc_benchmark.predict(X_test)
print('DecisionTreeClassifier with SMOTE')
print(classification_report(y_test, y_pred_dtc_benchmark))

DecisionTreeClassifier with SMOTE
              precision    recall  f1-score   support

         0.0       0.93      0.91      0.92      2606
         1.0       0.56      0.61      0.58       477

    accuracy                           0.86      3083
   macro avg       0.74      0.76      0.75      3083
weighted avg       0.87      0.86      0.87      3083

 sns.heatmap(confusion_matrix(y_test, y_pred_dtc_benchmark), fmt='d', cmap='viridis', annot=True, annot_kws={"size": 15})
plt.xticks([0.5,1.5], [0,1], size=15)
plt.yticks([0.5,1.5], [0,1], size=15)
plt.ylabel('True label', size=15)
plt.xlabel('Predicted label', size=15)
plt.show()

 def plot_learning_curve_scoring(estimator, title, X, y, axes=None, ylim=None, cv=None, scoring=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    estimator : estimator instance
        An estimator instance implementing `fit` and `predict` methods which
        will be cloned for each validation.

    title : str
        Title for the chart.

    X : array-like of shape (n_samples, n_features)
        Training vector, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features.

    y : array-like of shape (n_samples) or (n_samples, n_features)
        Target relative to ``X`` for classification or regression;
        None for unsupervised learning.

    axes : array-like of shape (3,), default=None
        Axes to use for plotting the curves.

    ylim : tuple of shape (2,), default=None
        Defines minimum and maximum y-values plotted, e.g. (ymin, ymax).

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like of shape (n_ticks,)
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the ``dtype`` is float, it is regarded
        as a fraction of the maximum size of the training set (that is
        determined by the selected validation method), i.e. it has to be within
        (0, 1]. Otherwise it is interpreted as absolute sizes of the training
        sets. Note that for classification the number of samples usually have
        to be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
    """
#     if axes is None:
#         _, axes = plt.subplots(1, 3, figsize=(20, 5))
    _, axes = plt.subplots()

    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes, scoring=scoring,
                       return_times=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Plot learning curve
    axes.grid()
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes.legend(loc="best")

    return plt

 %%time
title = r"Learning Curves (Decision Tree Classifier)"
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=0)
dtc_benchmark = pl_3[1]
plot_learning_curve_scoring(dtc_benchmark, title, X_train_val, y_train_val, #axes=axes[:, 2], ylim=(0.7, 1.01),
                    cv=cv, scoring='f1', n_jobs=4)
plt.show()

Wall time: 1min 1s

 %%time
# tree = DecisionTreeClassifier()
adaboost = AdaBoostClassifier(n_estimators=50, learning_rate=0.1, random_state=26)

X_train_val_samp, y_train_val_samp = SMOTE().fit_resample(X_train_val, y_train_val)
adaboost.fit(X_train_val_samp, y_train_val_samp)

Wall time: 10.2 s

AdaBoostClassifier(learning_rate=0.1, random_state=26)

 print(classification_report(y_test, adaboost.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.96      0.89      0.92      2606
         1.0       0.57      0.81      0.67       477

    accuracy                           0.88      3083
   macro avg       0.77      0.85      0.80      3083
weighted avg       0.90      0.88      0.89      3083

 %%time
title = r"Learning Curves (Adaboost)"
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=0)
plot_learning_curve_scoring(adaboost, title, X_train_val_samp, y_train_val_samp, #axes=axes[:, 2], ylim=(0.7, 1.01),
                    cv=cv, scoring='f1', n_jobs=4)
plt.show()

Wall time: 2min 6s

 %%time
# tree = DecisionTreeClassifier()
gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=26)

gbc.fit(X_train_val_samp, y_train_val_samp)

Wall time: 17.6 s

GradientBoostingClassifier(n_estimators=50, random_state=26)

 y_pred_gbc = gbc.predict(X_test)
print(classification_report(y_test, y_pred_gbc))

              precision    recall  f1-score   support

         0.0       0.95      0.91      0.93      2606
         1.0       0.62      0.75      0.68       477

    accuracy                           0.89      3083
   macro avg       0.79      0.83      0.81      3083
weighted avg       0.90      0.89      0.89      3083

 %%time
title = r"Learning Curves (Gradient Boosting)"
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=0)
plot_learning_curve_scoring(gbc, title, X_train_val_samp, y_train_val_samp, #axes=axes[:, 2], ylim=(0.7, 1.01),
                    cv=cv, scoring='f1', n_jobs=4)
plt.show()

Wall time: 2min 29s

 %%time
# tree = DecisionTreeClassifier()
rfc = RandomForestClassifier(n_estimators=50, random_state=26)

rfc.fit(X_train_val_samp, y_train_val_samp)

Wall time: 12.1 s

RandomForestClassifier(n_estimators=50, random_state=26)

 print(classification_report(y_test, rfc.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.93      0.94      0.94      2606
         1.0       0.67      0.63      0.65       477

    accuracy                           0.90      3083
   macro avg       0.80      0.79      0.79      3083
weighted avg       0.89      0.90      0.89      3083

 %%time
title = r"Learning Curves (Random Forest)"
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=0)
plot_learning_curve_scoring(rfc, title, X_train_val_samp, y_train_val_samp, #axes=axes[:, 2], ylim=(0.7, 1.01),
                    cv=cv, scoring='f1', n_jobs=4)
plt.show()

Wall time: 2min 16s

 %%time
# tree = DecisionTreeClassifier()
etc = ExtraTreesClassifier(n_estimators=50, random_state=26)

etc.fit(X_train_val_samp, y_train_val_samp)

Wall time: 13.2 s

ExtraTreesClassifier(n_estimators=50, random_state=26)

 print(classification_report(y_test, rfc.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.93      0.94      0.94      2606
         1.0       0.67      0.63      0.65       477

    accuracy                           0.90      3083
   macro avg       0.80      0.79      0.79      3083
weighted avg       0.89      0.90      0.89      3083

 %%time
title = r"Learning Curves (Extra Trees)"
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=0)
plot_learning_curve_scoring(etc, title, X_train_val_samp, y_train_val_samp, #axes=axes[:, 2], ylim=(0.7, 1.01),
                    cv=cv, scoring='f1', n_jobs=4)
plt.show()

Wall time: 1min 34s

	logreg	tree	sgdc	nb
test_f1	0.504093	0.567794	0.449368	0.455454
train_f1	0.500112	1.000000	0.435770	0.471333
test_accuracy	0.882988	0.865146	0.793210	0.770414
train_accuracy	0.882394	1.000000	0.796209	0.777495
test_recall	0.385705	0.572323	0.468307	0.621208
train_recall	0.380675	1.000000	0.456469	0.640990
test_precision	0.729643	0.563962	0.686684	0.359917
train_precision	0.730167	1.000000	0.681902	0.373106

	logreg	tree	sgdc	nb
test_f1	0.502654	0.568392	0.543770	0.340904
train_f1	0.504524	1.000000	0.542851	0.345649
test_accuracy	0.883205	0.865470	0.874769	0.450963
train_accuracy	0.883773	1.000000	0.875149	0.455903
test_recall	0.382230	0.572325	0.487819	0.909137
train_recall	0.382423	1.000000	0.483740	0.919645
test_precision	0.736758	0.565082	0.656342	0.210253
train_precision	0.741252	1.000000	0.664353	0.213386

	logreg	tree	sgdc	nb
test_f1	0.628132	0.548869	0.571692	0.323790
train_f1	0.626785	1.000000	0.576558	0.330885
test_accuracy	0.877257	0.854223	0.780127	0.687252
train_accuracy	0.876527	1.000000	0.783718	0.689386
test_recall	0.670149	0.573042	0.743617	0.483582
train_recall	0.669985	1.000000	0.749976	0.496333
test_precision	0.591368	0.528304	0.514674	0.243508
train_precision	0.588829	1.000000	0.518761	0.248202

	logreg	tree	sgdc	nb
test_f1	0.601827	0.559484	0.525610	0.321488
train_f1	0.610741	1.000000	0.541633	0.324431
test_accuracy	0.844275	0.857466	0.789554	0.417867
train_accuracy	0.847734	1.000000	0.794906	0.422515
test_recall	0.760303	0.584918	0.752007	0.889571
train_recall	0.771314	1.000000	0.779504	0.894835
test_precision	0.498299	0.536952	0.411132	0.196308
train_precision	0.505744	1.000000	0.421372	0.198241

	logreg	tree	sgdc	nb
model	0.504	0.568	0.449	0.455
Scaling+model	0.503	0.568	0.544	0.341
SMOTE+model	0.632	0.549	0.407	0.323
Scaling+SMOTE+model	0.595	0.550	0.534	0.321

Customer's Online Shopping Purchasing Intention¶

1. Dataset Summary¶

2. Goals¶

3. EDA¶

4. Modelling¶

Scoring¶

4.1 Baseline Model¶

1. Majority Classifier¶

2. Simple Logistic Regression¶

3. Single-feature Decision Tree¶

4.2 Benchmark Model & Cross Validation¶

Cross-validation Result Discussion¶

Best benchmark model¶

4.3 Model Improvement¶

1. Boosting with AdaBoostClassifier¶

2. Boosting with GradientBoostingClassifier¶

4. Bagging with RandomForestClassifier¶

5. Bagging with ExtraTreesClassifier¶

5. Conclusion and Suggestion¶

	features	data_type	unique	values_ex
0	Administrative	float64	27	[16.0, 8.0]
1	AdministrativeDuration	float64	3335	[3.0, 54.93333]
2	Informational	float64	17	[3.0, 1.0]
3	InformationalDuration	float64	1258	[609.0, 1.0]
4	ProductRelated	float64	311	[153.0, 63.0]
5	ProductRelatedDuration	float64	9551	[608.4, 1007.9]
6	BounceRates	float64	1727	[0.00713, 0.15]
7	ExitRates	float64	3418	[0.04054, 0.02429]
8	PageValues	float64	2702	[9.22124, 8.33057]
9	SpecialDay	float64	6	[0.2, 1.0]
10	OperatingSystems	float64	8	[7.0, 2.0]
11	Browser	float64	13	[9.0, 13.0]
12	Region	float64	9	[7.0, 3.0]
13	TrafficType	float64	20	[4.0, 20.0]
14	Weekend	float64	2	[1.0, 0.0]
15	Month	float64	10	[10.0, 12.0]
16	VisitorType	float64	3	[3.0, 2.0, 1.0]
17	HasPurchased	float64	2	[1.0, 0.0]

	model	Scaling+model	SMOTE+model	Scaling+SMOTE+model
test_f1	0.568	0.568	0.549	0.550
train_f1	1.000	1.000	1.000	1.000
test_accuracy	0.865	0.865	0.851	0.855
train_accuracy	1.000	1.000	1.000	1.000
test_recall	0.572	0.572	0.586	0.574
train_recall	1.000	1.000	1.000	1.000
test_precision	0.564	0.565	0.518	0.529
train_precision	1.000	1.000	1.000	1.000

	model	Scaling+model	SMOTE+model	Scaling+SMOTE+model
test_f1	0.504	0.503	0.632	0.595
train_f1	0.500	0.505	0.625	0.604
test_accuracy	0.883	0.883	0.877	0.839
train_accuracy	0.882	0.884	0.875	0.843
test_recall	0.386	0.382	0.683	0.765
train_recall	0.381	0.382	0.673	0.771
test_precision	0.730	0.737	0.588	0.487
train_precision	0.730	0.741	0.584	0.497